{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b529f5bf-9496-4d81-abd9-9a5d33f006c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import json\n",
    "import pickle\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bcfecb44-34d5-4bfd-898e-39875509fa5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "490\n"
     ]
    }
   ],
   "source": [
    "# Load the floor-speech TBIP results (presumably keyed by bioguide ID - verify against the producer).\n",
    "# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.\n",
    "# The context manager closes the file handle, which the bare open() call never did.\n",
    "with open('speeches_results/bid_to_tbip_floor_speeches.pkl', 'rb') as speeches_pkl:\n",
    "    bid_to_tbip_speeches = pickle.load(speeches_pkl)\n",
    "print(len(bid_to_tbip_speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f6603f2e-2926-4cfd-869b-071de4fff242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "471\n"
     ]
    }
   ],
   "source": [
    "# Load the tweet TBIP results (presumably keyed by bioguide ID - verify against the producer).\n",
    "# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.\n",
    "# The context manager closes the file handle, which the bare open() call never did.\n",
    "with open('tweets_results/bid_to_tbip_tweets.pkl', 'rb') as tweets_pkl:\n",
    "    bid_to_tbip_tweets = pickle.load(tweets_pkl)\n",
    "print(len(bid_to_tbip_tweets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "d1a638c0-b740-4942-b8d8-15c9ec1e53cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "509\n"
     ]
    }
   ],
   "source": [
    "# Sorted union of every ID that has a TBIP result from floor speeches, tweets, or both.\n",
    "bioguide_ids_with_speech_or_tweet_tbip = sorted(\n",
    "    set(bid_to_tbip_speeches.keys()) | set(bid_to_tbip_tweets.keys())\n",
    ")\n",
    "print(len(bioguide_ids_with_speech_or_tweet_tbip))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "a5371c73-3866-45db-9783-d14bb2036503",
   "metadata": {},
   "outputs": [],
   "source": [
    "## House members data for Congress 115/116,\n",
    "## crucially contains info on caucus memberships and leadership positions along with a host\n",
    "## of other info - we refer to them as caucus data files in this code.\n",
    "\n",
    "def _load_house_members(xlsx_path, sheet):\n",
    "    \"\"\"Read a single worksheet and keep only the rows that have a bioguide_id.\n",
    "\n",
    "    xlsx_path: path of the Excel workbook.\n",
    "    sheet: name of the worksheet to read.\n",
    "    Returns the filtered DataFrame.\n",
    "    \"\"\"\n",
    "    # Naming the sheet avoids parsing every worksheet in the workbook (sheet_name=None).\n",
    "    members = pd.read_excel(xlsx_path, sheet_name=sheet, engine='openpyxl')\n",
    "    return members[members['bioguide_id'].notnull()]\n",
    "\n",
    "\n",
    "legis_data_115 = _load_house_members('supporting_data_files/H115_members.xlsx', 'H115_members (1)')\n",
    "legis_data_116 = _load_house_members('supporting_data_files/H116_members.xlsx', 'H116_members')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a90df0a1-b51c-42dc-a732-bc9d3cf7bf3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "450\n",
      "437\n"
     ]
    }
   ],
   "source": [
    "all_caucus_data_115_ids = list(legis_data_115['bioguide_id'])\n",
    "print(len(all_caucus_data_115_ids))\n",
    "all_caucus_data_116_ids = list(legis_data_116['bioguide_id'])\n",
    "print(len(all_caucus_data_116_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "a76c1974-481b-4b06-9eef-e4ea1fc3a802",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "503\n"
     ]
    }
   ],
   "source": [
    "# Keep, in their existing order, only the IDs that also occur in the 115th or 116th caucus data.\n",
    "s = set(all_caucus_data_115_ids) | set(all_caucus_data_116_ids)\n",
    "final_bids_to_consider = [\n",
    "    candidate for candidate in bioguide_ids_with_speech_or_tweet_tbip if candidate in s\n",
    "]\n",
    "print(len(final_bids_to_consider))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4c4eaa4-f56f-422e-ba21-67ca499244e9",
   "metadata": {},
   "source": [
    "### Loading and adding basic biographical information about legislators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "0fa4b852-e797-423e-a5df-ae76e7bb6cad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index the legislator records by their bioguide ID for direct lookup.\n",
    "# The context manager closes the file handle, which the bare open() call never did.\n",
    "with open('supporting_data_files/legislator-info-1990-2020.json') as legis_info_file:\n",
    "    legis_info = json.load(legis_info_file)\n",
    "legis_id_to_info = {record['id']['bioguide']: record for record in legis_info}\n",
    "del legis_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "303ebbfd-fa49-4b78-8881-89dcb87eec87",
   "metadata": {},
   "outputs": [],
   "source": [
    "bid_to_name = {}\n",
    "bid_to_gender = {}\n",
    "bid_to_party = {}\n",
    "bid_to_birth_year = {}\n",
    "bid_to_seniority = {} #number of terms in the US House\n",
    "bid_to_state_district = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "3f4212ba-2814-4aa6-b211-d8e837740a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _state_district_label(state, district):\n",
    "    \"\"\"Return 'ST-AL' for district 0, otherwise 'ST-NN' with a zero-padded district number.\"\"\"\n",
    "    if district == 0:\n",
    "        return state + '-AL'\n",
    "    if district < 10:\n",
    "        return state + '-0' + str(district)\n",
    "    return state + '-' + str(district)\n",
    "\n",
    "\n",
    "# Set membership is constant-time; the ID list would be rescanned for every legislator.\n",
    "caucus_115_id_set = set(all_caucus_data_115_ids)\n",
    "\n",
    "for bid in final_bids_to_consider:\n",
    "    x = legis_id_to_info[bid]\n",
    "    # Use the 'ballotpedia' entry when the record has one, else the 'wikipedia' entry.\n",
    "    name_key = 'ballotpedia' if 'ballotpedia' in x['id'] else 'wikipedia'\n",
    "    bid_to_name[bid] = x['id'][name_key]\n",
    "    bid_to_party[bid] = x['terms'][0]['party']  # party recorded on the first listed term\n",
    "    bid_to_gender[bid] = x['bio']['gender']\n",
    "    # Birth year comes from the 115th-Congress file when the member is in it, else the 116th.\n",
    "    if bid in caucus_115_id_set:\n",
    "        bid_to_birth_year[bid] = list(legis_data_115[legis_data_115['bioguide_id']==bid]['born'])[0]\n",
    "    else:\n",
    "        bid_to_birth_year[bid] = int(list(legis_data_116[legis_data_116['bioguide_id']==bid]['born'])[0])\n",
    "    # Seniority counts House ('rep') terms whose end year is 2021 or earlier.\n",
    "    terms = [z for z in x['terms'] if z['type']=='rep' and int(z['end'][:4])<=2021]\n",
    "    bid_to_seniority[bid] = len(terms)\n",
    "    last_term = terms[-1]\n",
    "    state = last_term['state']\n",
    "    if 'district' not in last_term:\n",
    "        # Fail loudly instead of print-and-break: stopping silently would leave the\n",
    "        # dictionaries half-filled for the cells that index them by every ID.\n",
    "        raise KeyError('last House term of ' + str(bid) + ' has no district')\n",
    "    bid_to_state_district[bid] = _state_district_label(state, last_term['district'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6d53e98-bbc3-48bf-bf5b-c191cf18d6e6",
   "metadata": {},
   "source": [
    "### Loading and adding data on % of district pop. that voted for Dem/GOP candidate in 2016 presidential election"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "cd7992ce-9b6c-4e90-9763-74964df62f47",
   "metadata": {},
   "outputs": [],
   "source": [
    "bid_to_district_pres_vs = {} #for dem candidate, GE 2016\n",
    "bid_to_district_gop_vs = {} #for gop candidate, GE 2016"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "588648a2-c3fb-4eb4-a566-3916478a624c",
   "metadata": {},
   "outputs": [],
   "source": [
    "house_election_shares_df = pd.read_csv('supporting_data_files/Daily Kos Elections 2008, 2012 & 2016 presidential election results for congressional districts used in 2020 elections - Results.csv',\n",
    "                                       skiprows=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "e775d2f6-b3b3-48d7-92f8-125af337fb30",
   "metadata": {},
   "outputs": [],
   "source": [
    "district_to_pres_vs = dict(zip(house_election_shares_df.CD, house_election_shares_df.Clinton))\n",
    "district_to_gop_vs = dict(zip(house_election_shares_df.CD, house_election_shares_df.Trump))\n",
    "del house_election_shares_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "b17d26f9-0b41-44c7-b556-caad89e251f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "for bid in final_bids_to_consider:\n",
    "    sd = bid_to_state_district[bid]\n",
    "    bid_to_district_pres_vs[bid] = district_to_pres_vs[sd]\n",
    "    bid_to_district_gop_vs[bid] = district_to_gop_vs[sd]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff63d6a6-bbc3-4e86-914e-d71f3fc2d691",
   "metadata": {},
   "source": [
    "### Loading and adding data for DW-NOMINATE scores, caucus memberships, and leadership positions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "11fa339d-9fcb-49cf-b3c2-929112b4c77f",
   "metadata": {},
   "outputs": [],
   "source": [
    "bid_to_dwnom1 = {}\n",
    "bid_to_dwnom2 = {}\n",
    "bid_to_cong_presence_115 = {}\n",
    "bid_to_progressive_115 = {}\n",
    "bid_to_bluedog_115 = {}\n",
    "bid_to_newdem_115 = {}\n",
    "bid_to_problemsolvers_115 = {}\n",
    "bid_to_freedom_115 = {}\n",
    "bid_to_rsc_115 = {}\n",
    "bid_to_gop_lead_115 = {}\n",
    "bid_to_dem_lead_115 = {}\n",
    "bid_to_top_comm_115 = {}\n",
    "bid_to_comm_chair_115 = {}\n",
    "bid_to_cong_presence_116 = {}\n",
    "bid_to_progressive_116 = {}\n",
    "bid_to_bluedog_116 = {}\n",
    "bid_to_newdem_116 = {}\n",
    "bid_to_problemsolvers_116 = {}\n",
    "bid_to_freedom_116 = {}\n",
    "bid_to_rsc_116 = {}\n",
    "bid_to_gop_lead_116 = {}\n",
    "bid_to_dem_lead_116 = {}\n",
    "bid_to_top_comm_116 = {}\n",
    "bid_to_comm_chair_116 = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "bf00d765-2082-4294-9ca7-e683420a5a4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_116_ids = all_caucus_data_116_ids[:]\n",
    "all_115_ids = all_caucus_data_115_ids[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "fb65e522-01ee-4c04-8552-8f7c5c83be6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _first_value(rows, column):\n",
    "    \"\"\"Return the first entry of `column` in the already-filtered frame `rows`.\"\"\"\n",
    "    return list(rows[column])[0]\n",
    "\n",
    "\n",
    "def _membership_flag(rows, column):\n",
    "    \"\"\"Return 1 when `rows` is not None and its first `column` entry equals 1, else 0.\"\"\"\n",
    "    return 1 if rows is not None and _first_value(rows, column) == 1 else 0\n",
    "\n",
    "\n",
    "# (output dict, column name) pairs; the two data files spell their columns differently.\n",
    "_flag_targets_115 = [\n",
    "    (bid_to_progressive_115, 'progressive'),\n",
    "    (bid_to_bluedog_115, 'bluedog'),\n",
    "    (bid_to_newdem_115, 'newdems'),\n",
    "    (bid_to_freedom_115, 'freedom'),\n",
    "    (bid_to_rsc_115, 'rsc'),\n",
    "    (bid_to_problemsolvers_115, 'problemsolvers'),\n",
    "    (bid_to_gop_lead_115, 'GOPleadership'),\n",
    "    (bid_to_dem_lead_115, 'DEMleadership'),\n",
    "    (bid_to_comm_chair_115, 'CommitteeChair'),\n",
    "    (bid_to_top_comm_115, 'TopCommittee'),\n",
    "]\n",
    "_flag_targets_116 = [\n",
    "    (bid_to_progressive_116, 'Progressive'),\n",
    "    (bid_to_bluedog_116, 'BlueDog'),\n",
    "    (bid_to_newdem_116, 'NewDemocrat'),\n",
    "    (bid_to_freedom_116, 'Freedom'),\n",
    "    (bid_to_rsc_116, 'RSC'),\n",
    "    (bid_to_problemsolvers_116, 'ProblemSolvers'),\n",
    "    (bid_to_gop_lead_116, 'GOPLeadership'),\n",
    "    (bid_to_dem_lead_116, 'DemLeadership'),\n",
    "    (bid_to_comm_chair_116, 'CommitteeChair'),\n",
    "    (bid_to_top_comm_116, 'TopCommittee'),\n",
    "]\n",
    "\n",
    "# Set membership is constant-time; the ID lists would be rescanned for every legislator.\n",
    "_id_set_115, _id_set_116 = set(all_115_ids), set(all_116_ids)\n",
    "\n",
    "for bid in final_bids_to_consider:\n",
    "    present_115, present_116 = int(bid in _id_set_115), int(bid in _id_set_116)\n",
    "    # Reset both frames per legislator so rows left over from a previous iteration\n",
    "    # can never be read for a member who is absent from one of the files.\n",
    "    df115 = legis_data_115[legis_data_115['bioguide_id']==bid] if present_115 else None\n",
    "    df116 = legis_data_116[legis_data_116['bioguide_id']==bid] if present_116 else None\n",
    "\n",
    "    # DW-NOMINATE scores: use the 116th-Congress file when available, else the 115th.\n",
    "    nominate_rows = df116 if present_116 else df115\n",
    "    bid_to_dwnom1[bid] = _first_value(nominate_rows, 'nominate_dim1')\n",
    "    bid_to_dwnom2[bid] = _first_value(nominate_rows, 'nominate_dim2')\n",
    "    bid_to_cong_presence_115[bid] = present_115\n",
    "    bid_to_cong_presence_116[bid] = present_116\n",
    "\n",
    "    # Each flag is 1 only if the member is in that file and the column holds 1.\n",
    "    for target, column in _flag_targets_115:\n",
    "        target[bid] = _membership_flag(df115, column)\n",
    "    for target, column in _flag_targets_116:\n",
    "        target[bid] = _membership_flag(df116, column)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75931bb7-339e-4908-b232-7851fb702910",
   "metadata": {},
   "source": [
    "### Loading and adding data for legislators' % vote share in the 2016/2018 House elections in their districts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "a208e830-41f6-46b2-84bd-077fea151b04",
   "metadata": {},
   "outputs": [],
   "source": [
    "house_elec_results = pd.read_csv('supporting_data_files/1976-2018-house3.csv', \n",
    "                                 encoding='ISO-8859-1')\n",
    "house_elec_results = house_elec_results[house_elec_results['year']>=2016]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "69c1929a-d65d-4f81-b2dc-ff169a9f0117",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _election_rows(results, term, election_year, party, candidate_name):\n",
    "    \"\"\"Select the rows of `results` for one legislator in one election year.\n",
    "\n",
    "    Rows are restricted to the state and district of `term` and to `election_year`.\n",
    "    Rows whose party contains `party` are preferred; when there are none, the rows\n",
    "    whose candidate equals the upper-cased `candidate_name` are returned instead\n",
    "    (possibly an empty frame).\n",
    "    \"\"\"\n",
    "    in_district = results[(results['state_po']==term['state'])\n",
    "                          & (results['district']==term['district'])\n",
    "                          & (results['year']==election_year)]\n",
    "    by_party = in_district[in_district['party'].str.contains(party, na=False)]\n",
    "    if len(by_party):\n",
    "        return by_party\n",
    "    return in_district[in_district['candidate']==candidate_name.upper()]\n",
    "\n",
    "\n",
    "bid_to_house_elec_vote_share_2016 = {}\n",
    "bid_to_house_elec_vote_share_2018 = {}\n",
    "for bid in final_bids_to_consider:\n",
    "    party = bid_to_party[bid].upper()\n",
    "    terms = legis_id_to_info[bid]['terms']\n",
    "    name = bid_to_name[bid].split(' (')[0]\n",
    "\n",
    "    # (output dict, member present in that Congress, start year of the term, election year)\n",
    "    elections = [\n",
    "        (bid_to_house_elec_vote_share_2016, bid_to_cong_presence_115[bid], '2017', 2016),\n",
    "        (bid_to_house_elec_vote_share_2018, bid_to_cong_presence_116[bid], '2019', 2018),\n",
    "    ]\n",
    "    failed = False\n",
    "    for vote_shares, present, term_start_year, election_year in elections:\n",
    "        # Look the term up afresh so a previous legislator's term is never reused when\n",
    "        # this member has no term starting in that year; such members get NaN.\n",
    "        term = next((t for t in terms if t['start'][:4] == term_start_year), None)\n",
    "        if not present or term is None:\n",
    "            vote_shares[bid] = np.nan\n",
    "            continue\n",
    "        # The frame holds every year >= 2016, so the election year is filtered explicitly\n",
    "        # for both elections (2018 rows were previously mixed into the 2016 share).\n",
    "        df = _election_rows(house_elec_results, term, election_year, party, name)\n",
    "        try:\n",
    "            voteshare = 100*(sum(df['candidatevotes'])/sum(df['totalvotes']))\n",
    "        except Exception as e:\n",
    "            # Report which legislator could not be matched, then stop the whole loop.\n",
    "            print(e)\n",
    "            print(bid)\n",
    "            print(party)\n",
    "            failed = True\n",
    "            break\n",
    "        vote_shares[bid] = voteshare\n",
    "    if failed:\n",
    "        break\n",
    "del house_elec_results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60b50b80-6fa1-4cad-ba89-865fcd3c9729",
   "metadata": {},
   "source": [
    "### Loading and adding district-specific data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "bbb1548a-6c9c-440e-ae05-5c88d22e7221",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 435 entries, 0 to 434\n",
      "Data columns (total 6 columns):\n",
      " #   Column            Non-Null Count  Dtype  \n",
      "---  ------            --------------  -----  \n",
      " 0   CD                435 non-null    object \n",
      " 1   Cluster           435 non-null    object \n",
      " 2   Very low density  435 non-null    float64\n",
      " 3   Low density       435 non-null    float64\n",
      " 4   Medium density    435 non-null    float64\n",
      " 5   High density      435 non-null    float64\n",
      "dtypes: float64(4), object(2)\n",
      "memory usage: 20.5+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "citylab_cdi = pd.read_csv('supporting_data_files/citylab_cdi.csv')\n",
    "print(citylab_cdi.info())\n",
    "# (from https://github.com/theatlantic/citylab-data/blob/master/citylab-congress/citylab_cdi.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "e84466f8-b93b-4337-9653-23ef672ef3df",
   "metadata": {},
   "outputs": [],
   "source": [
    "district_to_density = dict(zip(citylab_cdi.CD, \n",
    "                               citylab_cdi.Cluster))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "e6addc00-4ce0-48c3-a512-e315f2baf84e",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 437 entries, 0 to 436\n",
      "Data columns (total 26 columns):\n",
      " #   Column                  Non-Null Count  Dtype  \n",
      "---  ------                  --------------  -----  \n",
      " 0   id                      437 non-null    object \n",
      " 1   Geographic.Area.Name    437 non-null    object \n",
      " 2   TotalPop                437 non-null    int64  \n",
      " 3   MalePop                 437 non-null    int64  \n",
      " 4   FemalePop               437 non-null    int64  \n",
      " 5   MedianAge               437 non-null    float64\n",
      " 6   SeniorPop               437 non-null    int64  \n",
      " 7   WhitePop                437 non-null    int64  \n",
      " 8   WhitePercent            437 non-null    float64\n",
      " 9   BlackPop                437 non-null    int64  \n",
      " 10  BlackPercent            437 non-null    float64\n",
      " 11  AmIndianPop             437 non-null    int64  \n",
      " 12  AmIndianPercent         437 non-null    float64\n",
      " 13  AsianPop                437 non-null    int64  \n",
      " 14  AsianPercent            437 non-null    float64\n",
      " 15  HispanicPop             437 non-null    int64  \n",
      " 16  HispanicPercent         437 non-null    float64\n",
      " 17  VAP                     437 non-null    int64  \n",
      " 18  DistrictMedianIncome    437 non-null    int64  \n",
      " 19  DistrictMeanIncome      437 non-null    int64  \n",
      " 20  NationalMedian          437 non-null    int64  \n",
      " 21  DifferenceMedianIncome  437 non-null    int64  \n",
      " 22  NationalMean            437 non-null    int64  \n",
      " 23  DifferenceMeanIncome    437 non-null    int64  \n",
      " 24  UneployedPop            437 non-null    int64  \n",
      " 25  UnemploymentRate        437 non-null    float64\n",
      "dtypes: float64(7), int64(17), object(2)\n",
      "memory usage: 88.9+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "census_115 = pd.read_csv('supporting_data_files/CensusMerge_115.csv')\n",
    "print(census_115.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "f31d2766-b5b3-483a-886e-e7723e506c18",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 437 entries, 0 to 436\n",
      "Data columns (total 26 columns):\n",
      " #   Column                  Non-Null Count  Dtype  \n",
      "---  ------                  --------------  -----  \n",
      " 0   id                      437 non-null    object \n",
      " 1   Geographic.Area.Name    437 non-null    object \n",
      " 2   TotalPop                437 non-null    int64  \n",
      " 3   TotalMale               437 non-null    int64  \n",
      " 4   TotalFemale             437 non-null    int64  \n",
      " 5   MedianAge               437 non-null    float64\n",
      " 6   SeniorPop               437 non-null    int64  \n",
      " 7   WhitePop                437 non-null    int64  \n",
      " 8   WhitePercent            437 non-null    float64\n",
      " 9   BlackPop                437 non-null    int64  \n",
      " 10  BlackPercent            437 non-null    float64\n",
      " 11  AmIndianPop             437 non-null    int64  \n",
      " 12  AmIndianPercent         437 non-null    float64\n",
      " 13  AsianPop                437 non-null    int64  \n",
      " 14  AsianPercentage         437 non-null    float64\n",
      " 15  HispanicPop             437 non-null    int64  \n",
      " 16  HispanicPercentage      437 non-null    float64\n",
      " 17  VAP                     437 non-null    int64  \n",
      " 18  DistrictMedianIncome    437 non-null    int64  \n",
      " 19  DistrictMeanIncome      437 non-null    int64  \n",
      " 20  NationalMedian          437 non-null    int64  \n",
      " 21  DifferenceMedianIncome  437 non-null    int64  \n",
      " 22  NationalMean            437 non-null    int64  \n",
      " 23  DifferenceMeanIncome    437 non-null    int64  \n",
      " 24  UnemployedPop           437 non-null    int64  \n",
      " 25  UnemployedRate          437 non-null    float64\n",
      "dtypes: float64(7), int64(17), object(2)\n",
      "memory usage: 88.9+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "census_116 = pd.read_csv('supporting_data_files/CensusMerge_116.csv')\n",
    "print(census_116.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "9bddf3e0-07ab-4e39-b644-5d143abddc03",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_code = pd.read_csv('supporting_data_files/state_abbr_code.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "d674a185-b093-489e-9fd0-93dc2ff809aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_to_code = dict(zip(state_code.State, state_code.Code))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "4d8690bf-e49c-4f15-a9e1-5d9a04ffa723",
   "metadata": {},
   "outputs": [],
   "source": [
    "def census_geographical_region_to_district(s):\n",
    "    \"\"\"Convert a census geographic-area name into a 'ST-NN' or 'ST-AL' district label.\n",
    "\n",
    "    The state name is the second piece of `s` when split on ', ' and is translated\n",
    "    through the module-level `state_to_code` mapping. If `s` contains 'at Large' the\n",
    "    suffix is 'AL'; otherwise the third whitespace-separated token is parsed as the\n",
    "    district number and zero-padded to two digits.\n",
    "    \"\"\"\n",
    "    abbreviation = state_to_code[s.split(', ')[1]]\n",
    "    if 'at Large' in s:\n",
    "        return abbreviation + '-AL'\n",
    "    number = int(s.split()[2])\n",
    "    padded = '0' + str(number) if number < 10 else str(number)\n",
    "    return abbreviation + '-' + padded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "b95a29ef-ee50-44cc-9189-8cfe606353fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_district_to_data_dics_from_census_data_csv(df):\n",
    "    \"\"\"Build per-district lookup dictionaries from one census DataFrame.\n",
    "\n",
    "    df: frame with a 'Geographic.Area.Name' column plus the population, percentage,\n",
    "        unemployment and income columns read below. Where two spellings are listed\n",
    "        for a column, whichever one exists in `df` is used.\n",
    "\n",
    "    Returns a tuple of 14 dicts keyed by district label, in this order: male %,\n",
    "    female %, senior %, median age, white %, black %, asian %, American Indian %,\n",
    "    hispanic %, unemployment rate, median income, mean income, difference in median\n",
    "    income, difference in mean income. Also prints the number of districts.\n",
    "\n",
    "    Raises KeyError when none of the accepted spellings of a column is present.\n",
    "    \"\"\"\n",
    "    def column(*candidates):\n",
    "        \"\"\"Return the first existing candidate column as a list without its last two rows.\"\"\"\n",
    "        for candidate in candidates:\n",
    "            if candidate in df.columns:\n",
    "                # The last two rows are dropped from every column.\n",
    "                # NOTE(review): presumably non-district rows - confirm against the CSVs.\n",
    "                return list(df[candidate])[:-2]\n",
    "        raise KeyError(candidates)\n",
    "\n",
    "    geog_area = column('Geographic.Area.Name')\n",
    "    total_pop = column('TotalPop')\n",
    "\n",
    "    def percent_of_total(counts):\n",
    "        \"\"\"Express each count as a percentage of the district's total population (2 decimals).\"\"\"\n",
    "        return [round(100*(x/y), 2) for x, y in zip(counts, total_pop)]\n",
    "\n",
    "    # Explicit column lookups replace the bare try/except blocks, which would also\n",
    "    # have hidden unrelated errors. The order below is the order of the returned tuple.\n",
    "    per_district_values = [\n",
    "        percent_of_total(column('TotalMale', 'MalePop')),\n",
    "        percent_of_total(column('TotalFemale', 'FemalePop')),\n",
    "        percent_of_total(column('SeniorPop')),\n",
    "        column('MedianAge'),\n",
    "        column('WhitePercent'),\n",
    "        column('BlackPercent'),\n",
    "        column('AsianPercent', 'AsianPercentage'),\n",
    "        column('AmIndianPercent'),\n",
    "        column('HispanicPercent', 'HispanicPercentage'),\n",
    "        column('UnemployedRate', 'UnemploymentRate'),\n",
    "        column('DistrictMedianIncome'),\n",
    "        column('DistrictMeanIncome'),\n",
    "        column('DifferenceMedianIncome'),\n",
    "        column('DifferenceMeanIncome'),\n",
    "    ]\n",
    "\n",
    "    districts = [census_geographical_region_to_district(g) for g in geog_area]\n",
    "    print(len(districts))\n",
    "\n",
    "    return tuple(dict(zip(districts, values)) for values in per_district_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "5ee3ca24-bc0b-4d31-805e-7bf24d618ae0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "435\n"
     ]
    }
   ],
   "source": [
    "district_to_male_percs_115, district_to_female_percs_115, district_to_senior_percs_115, district_to_median_ages_115, district_to_white_percs_115, district_to_black_percs_115, district_to_asian_percs_115, district_to_amindian_percs_115, district_to_hispanic_percs_115, district_to_unemployment_rates_115, district_to_median_incomes_115, district_to_mean_incomes_115, district_to_diff_median_incomes_115, district_to_diff_mean_incomes_115 = get_district_to_data_dics_from_census_data_csv(census_115)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "6d21dd0a-c836-4fe3-be0d-c9f0a02d682b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "435\n"
     ]
    }
   ],
   "source": [
    "district_to_male_percs_116, district_to_female_percs_116, district_to_senior_percs_116, district_to_median_ages_116, district_to_white_percs_116, district_to_black_percs_116, district_to_asian_percs_116, district_to_amindian_percs_116, district_to_hispanic_percs_116, district_to_unemployment_rates_116, district_to_median_incomes_116, district_to_mean_incomes_116, district_to_diff_median_incomes_116, district_to_diff_mean_incomes_116 = get_district_to_data_dics_from_census_data_csv(census_116)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "d888e6b6-8054-4712-a8bb-aeb96fed98bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "bid_to_district_density = {} #how urban/rural a district is\n",
    "bid_to_district_male_perc = {}\n",
    "bid_to_district_female_perc = {}\n",
    "bid_to_district_senior_perc = {}\n",
    "bid_to_district_median_age = {}\n",
    "bid_to_district_white_perc = {}\n",
    "bid_to_district_black_perc = {}\n",
    "bid_to_district_asian_perc = {}\n",
    "bid_to_district_amindian_perc = {}\n",
    "bid_to_district_hispanic_perc = {}\n",
    "bid_to_district_unemp_rate = {}\n",
    "bid_to_district_median_income = {}\n",
    "bid_to_district_mean_income = {}\n",
    "bid_to_district_diff_median_income = {}\n",
    "bid_to_district_diff_mean_income = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "a1597fb7-1ea9-41e2-b1b6-7708fa89b429",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per census attribute: (per-legislator dict to fill, _116 table, _115 table).\n",
    "census_attribute_tables = [\n",
    "    (bid_to_district_female_perc, district_to_female_percs_116, district_to_female_percs_115),\n",
    "    (bid_to_district_male_perc, district_to_male_percs_116, district_to_male_percs_115),\n",
    "    (bid_to_district_senior_perc, district_to_senior_percs_116, district_to_senior_percs_115),\n",
    "    (bid_to_district_median_age, district_to_median_ages_116, district_to_median_ages_115),\n",
    "    (bid_to_district_white_perc, district_to_white_percs_116, district_to_white_percs_115),\n",
    "    (bid_to_district_black_perc, district_to_black_percs_116, district_to_black_percs_115),\n",
    "    (bid_to_district_asian_perc, district_to_asian_percs_116, district_to_asian_percs_115),\n",
    "    (bid_to_district_amindian_perc, district_to_amindian_percs_116, district_to_amindian_percs_115),\n",
    "    (bid_to_district_hispanic_perc, district_to_hispanic_percs_116, district_to_hispanic_percs_115),\n",
    "    (bid_to_district_unemp_rate, district_to_unemployment_rates_116, district_to_unemployment_rates_115),\n",
    "    (bid_to_district_median_income, district_to_median_incomes_116, district_to_median_incomes_115),\n",
    "    (bid_to_district_diff_median_income, district_to_diff_median_incomes_116, district_to_diff_median_incomes_115),\n",
    "    (bid_to_district_mean_income, district_to_mean_incomes_116, district_to_mean_incomes_115),\n",
    "    (bid_to_district_diff_mean_income, district_to_diff_mean_incomes_116, district_to_diff_mean_incomes_115),\n",
    "]\n",
    "\n",
    "for bid in final_bids_to_consider:\n",
    "    sd = bid_to_state_district[bid]\n",
    "    bid_to_district_density[bid] = district_to_density[sd]\n",
    "    # A truthy 116 presence flag selects the _116 tables; otherwise the _115 tables are used.\n",
    "    in_cong_116 = bid_to_cong_presence_116[bid]\n",
    "    for bid_to_value, table_116, table_115 in census_attribute_tables:\n",
    "        bid_to_value[bid] = (table_116 if in_cong_116 else table_115)[sd]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2fa345a-c6e1-4c9d-90d1-4efe036220ea",
   "metadata": {},
   "source": [
    "### Adding ideal point values data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "4a1a2061-a71c-4d51-83ad-69ab0a4ef061",
   "metadata": {},
   "outputs": [],
   "source": [
    "def standardize(x):\n",
    "  \"\"\"Standardize a vector x.\"\"\"\n",
    "  return (x - np.nanmean(x)) / np.nanstd(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "eaffb42a-110b-4740-aae0-ce0fbe5db905",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw TBIP ideal point per legislator; NaN where the legislator has no entry in that source.\n",
    "# Speech values are multiplied by -1, tweet values are copied unchanged.\n",
    "bid_to_speech_tbip, bid_to_twitter_tbip = {}, {}\n",
    "for bid in final_bids_to_consider:\n",
    "    bid_to_speech_tbip[bid] = (\n",
    "        -1* bid_to_tbip_speeches[bid] if bid in bid_to_tbip_speeches else np.nan\n",
    "    )\n",
    "    bid_to_twitter_tbip[bid] = (\n",
    "        bid_to_tbip_tweets[bid] if bid in bid_to_tbip_tweets else np.nan\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "f0c6749e-90ab-4f5e-ae4a-4b405cb4d622",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "id": "dd53260c-e532-422d-8133-2dce798d334c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "503\n",
      "503\n",
      "503\n"
     ]
    }
   ],
   "source": [
    "vote_source_dir = 'tbip/data/congs_115-116_votes/'\n",
    "vote_data_dir = os.path.join(vote_source_dir, \"clean\")\n",
    "vote_param_dir = os.path.join(vote_source_dir, \"fits/params\")\n",
    "vote_ideal_points_1d = standardize(np.load(os.path.join(vote_param_dir, \n",
    "                                                        \"ideal_point_loc.npy\")))\n",
    "\n",
    "# rep_map.txt is read one entry per line (trailing whitespace stripped); an entry's\n",
    "# position is used below as the index into vote_ideal_points_1d.\n",
    "# The context manager guarantees the file handle is closed.\n",
    "with open(os.path.join(vote_data_dir, 'rep_map.txt')) as rep_map_file:\n",
    "    voting_reps_map = [line.rstrip() for line in rep_map_file]\n",
    "\n",
    "# Index the entries once instead of running a linear list.index() scan per legislator.\n",
    "# setdefault keeps the first occurrence, which is the position list.index() would return.\n",
    "# A legislator absent from rep_map.txt raises KeyError in the loop below.\n",
    "voting_rep_position = {}\n",
    "for position, rep_bid in enumerate(voting_reps_map):\n",
    "    voting_rep_position.setdefault(rep_bid, position)\n",
    "\n",
    "# Standardized vote ideal points are multiplied by -1.\n",
    "bid_to_stan_vote_tbip = {}\n",
    "for bid in final_bids_to_consider:\n",
    "    bid_to_stan_vote_tbip[bid] = -1*vote_ideal_points_1d[voting_rep_position[bid]]\n",
    "\n",
    "# Collect the raw values in final_bids_to_consider order so that entry i of each\n",
    "# standardized array belongs to final_bids_to_consider[i], independent of the\n",
    "# insertion order of the source dicts.\n",
    "stan_speech_ideal_points = standardize([bid_to_speech_tbip[b] for b in final_bids_to_consider])\n",
    "stan_tweet_ideal_points = standardize([bid_to_twitter_tbip[b] for b in final_bids_to_consider])\n",
    "\n",
    "bid_to_stan_speech_tbip = dict(zip(final_bids_to_consider, stan_speech_ideal_points))\n",
    "bid_to_stan_tweet_tbip = dict(zip(final_bids_to_consider, stan_tweet_ideal_points))\n",
    "\n",
    "print(len(bid_to_stan_vote_tbip))\n",
    "print(len(bid_to_stan_speech_tbip))\n",
    "print(len(bid_to_stan_tweet_tbip))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "id": "d41eea74-cd77-470f-8a5b-8ca22d2c6bdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output column name -> per-legislator dict, listed in output column order\n",
    "# (the Bioguide_ID column is added first, below).\n",
    "final_df_columns = {\n",
    "    'Name': bid_to_name,\n",
    "    'Gender': bid_to_gender,\n",
    "    'Party': bid_to_party,\n",
    "    'Born': bid_to_birth_year,\n",
    "    'Number_of_House_Terms': bid_to_seniority,\n",
    "\n",
    "    'Present_Cong115': bid_to_cong_presence_115,\n",
    "    'Present_Cong116': bid_to_cong_presence_116,\n",
    "\n",
    "    'House_Election_Candidate_Vote_Share_2016': bid_to_house_elec_vote_share_2016,\n",
    "    'House_Election_Candidate_Vote_Share_2018': bid_to_house_elec_vote_share_2018,\n",
    "\n",
    "    'District': bid_to_state_district,\n",
    "    'District_Presidential_VoteShare_Dem2016': bid_to_district_pres_vs,\n",
    "    'District_Presidential_VoteShare_GOP2016': bid_to_district_gop_vs,\n",
    "    'District_Density': bid_to_district_density,\n",
    "    'District_Percent_Female': bid_to_district_female_perc,\n",
    "    'District_Percent_Male': bid_to_district_male_perc,\n",
    "    'District_Percent_Senior': bid_to_district_senior_perc,\n",
    "    'District_Median_Age': bid_to_district_median_age,\n",
    "    'District_Percent_White': bid_to_district_white_perc,\n",
    "    'District_Percent_Black': bid_to_district_black_perc,\n",
    "    'District_Percent_Asian': bid_to_district_asian_perc,\n",
    "    'District_Percent_Hispanic': bid_to_district_hispanic_perc,\n",
    "    'District_Percent_AmericanIndian': bid_to_district_amindian_perc,\n",
    "    'District_Mean_Income': bid_to_district_mean_income,\n",
    "    'District_Mean_Minus_National_Mean_Income': bid_to_district_diff_mean_income,\n",
    "    'District_Median_Income': bid_to_district_median_income,\n",
    "    'District_Median_Minus_National_Median_Income': bid_to_district_diff_median_income,\n",
    "    'District_Unemployment_Rate': bid_to_district_unemp_rate,\n",
    "\n",
    "    'DW-Nominate_1': bid_to_dwnom1,\n",
    "    'DW-Nominate_2': bid_to_dwnom2,\n",
    "    'TBIP_Floor_Speeches': bid_to_speech_tbip,\n",
    "    'TBIP_Tweets': bid_to_twitter_tbip,\n",
    "    'Standardized_Vote_Ideal_Point': bid_to_stan_vote_tbip,\n",
    "    'Standardized_Speech_Ideal_Point': bid_to_stan_speech_tbip,\n",
    "    'Standardized_Tweet_Ideal_Point': bid_to_stan_tweet_tbip,\n",
    "\n",
    "    'Progressive_Caucus_Cong115': bid_to_progressive_115,\n",
    "    'NewDems_Caucus_Cong115': bid_to_newdem_115,\n",
    "    'BlueDog_Caucus_Cong115': bid_to_bluedog_115,\n",
    "    'ProblemSolvers_Caucus_Cong115': bid_to_problemsolvers_115,\n",
    "    'RSC_Caucus_Cong115': bid_to_rsc_115,\n",
    "    'Freedom_Caucus_Cong115': bid_to_freedom_115,\n",
    "    'GOP_Leadership_Cong115': bid_to_gop_lead_115,\n",
    "    'DEM_Leadership_Cong115': bid_to_dem_lead_115,\n",
    "    'CommitteeChair_Cong115': bid_to_comm_chair_115,\n",
    "    'TopCommittee_Cong115': bid_to_top_comm_115,\n",
    "\n",
    "    'Progressive_Caucus_Cong116': bid_to_progressive_116,\n",
    "    'NewDems_Caucus_Cong116': bid_to_newdem_116,\n",
    "    'BlueDog_Caucus_Cong116': bid_to_bluedog_116,\n",
    "    'ProblemSolvers_Caucus_Cong116': bid_to_problemsolvers_116,\n",
    "    'RSC_Caucus_Cong116': bid_to_rsc_116,\n",
    "    'Freedom_Caucus_Cong116': bid_to_freedom_116,\n",
    "    'GOP_Leadership_Cong116': bid_to_gop_lead_116,\n",
    "    'DEM_Leadership_Cong116': bid_to_dem_lead_116,\n",
    "    'CommitteeChair_Cong116': bid_to_comm_chair_116,\n",
    "    'TopCommittee_Cong116': bid_to_top_comm_116,\n",
    "}\n",
    "\n",
    "# Rows follow the key order of bid_to_name. Every value is looked up by bioguide id\n",
    "# rather than taken positionally from list(d.values()), so the table stays row-aligned\n",
    "# even if the dicts were filled in different orders; a legislator missing from any\n",
    "# dict raises KeyError instead of silently shifting that column.\n",
    "final_df_bids = list(bid_to_name.keys())\n",
    "final_df = pd.DataFrame({'Bioguide_ID': final_df_bids})\n",
    "for column_name, bid_to_value in final_df_columns.items():\n",
    "    final_df[column_name] = [bid_to_value[bid] for bid in final_df_bids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "id": "f9dbd115-879d-497e-a973-134494da2dbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the assembled table to disk; index=False leaves the row index out of the file.\n",
    "output_csv_path = 'legislator_info_and_tbip_congresses_115_and_116.csv'\n",
    "final_df.to_csv(output_csv_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "id": "9bf8cf65-fe7f-4f72-8272-4acae6d00b4d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 503 entries, 0 to 502\n",
      "Data columns (total 55 columns):\n",
      " #   Column                                        Non-Null Count  Dtype  \n",
      "---  ------                                        --------------  -----  \n",
      " 0   Bioguide_ID                                   503 non-null    object \n",
      " 1   Name                                          503 non-null    object \n",
      " 2   Gender                                        503 non-null    object \n",
      " 3   Party                                         503 non-null    object \n",
      " 4   Born                                          503 non-null    int64  \n",
      " 5   Number_of_House_Terms                         503 non-null    int64  \n",
      " 6   Present_Cong115                               503 non-null    int64  \n",
      " 7   Present_Cong116                               503 non-null    int64  \n",
      " 8   House_Election_Candidate_Vote_Share_2016      418 non-null    float64\n",
      " 9   House_Election_Candidate_Vote_Share_2018      420 non-null    float64\n",
      " 10  District                                      503 non-null    object \n",
      " 11  District_Presidential_VoteShare_Dem2016       503 non-null    float64\n",
      " 12  District_Presidential_VoteShare_GOP2016       503 non-null    float64\n",
      " 13  District_Density                              503 non-null    object \n",
      " 14  District_Percent_Female                       503 non-null    float64\n",
      " 15  District_Percent_Male                         503 non-null    float64\n",
      " 16  District_Percent_Senior                       503 non-null    float64\n",
      " 17  District_Median_Age                           503 non-null    float64\n",
      " 18  District_Percent_White                        503 non-null    float64\n",
      " 19  District_Percent_Black                        503 non-null    float64\n",
      " 20  District_Percent_Asian                        503 non-null    float64\n",
      " 21  District_Percent_Hispanic                     503 non-null    float64\n",
      " 22  District_Percent_AmericanIndian               503 non-null    float64\n",
      " 23  District_Mean_Income                          503 non-null    int64  \n",
      " 24  District_Mean_Minus_National_Mean_Income      503 non-null    int64  \n",
      " 25  District_Median_Income                        503 non-null    int64  \n",
      " 26  District_Median_Minus_National_Median_Income  503 non-null    int64  \n",
      " 27  District_Unemployment_Rate                    503 non-null    float64\n",
      " 28  DW-Nominate_1                                 503 non-null    float64\n",
      " 29  DW-Nominate_2                                 503 non-null    float64\n",
      " 30  TBIP_Floor_Speeches                           484 non-null    float64\n",
      " 31  TBIP_Tweets                                   471 non-null    float64\n",
      " 32  Standardized_Vote_Ideal_Point                 503 non-null    float64\n",
      " 33  Standardized_Speech_Ideal_Point               484 non-null    float64\n",
      " 34  Standardized_Tweet_Ideal_Point                471 non-null    float64\n",
      " 35  Progressive_Caucus_Cong115                    503 non-null    int64  \n",
      " 36  NewDems_Caucus_Cong115                        503 non-null    int64  \n",
      " 37  BlueDog_Caucus_Cong115                        503 non-null    int64  \n",
      " 38  ProblemSolvers_Caucus_Cong115                 503 non-null    int64  \n",
      " 39  RSC_Caucus_Cong115                            503 non-null    int64  \n",
      " 40  Freedom_Caucus_Cong115                        503 non-null    int64  \n",
      " 41  GOP_Leadership_Cong115                        503 non-null    int64  \n",
      " 42  DEM_Leadership_Cong115                        503 non-null    int64  \n",
      " 43  CommitteeChair_Cong115                        503 non-null    int64  \n",
      " 44  TopCommittee_Cong115                          503 non-null    int64  \n",
      " 45  Progressive_Caucus_Cong116                    503 non-null    int64  \n",
      " 46  NewDems_Caucus_Cong116                        503 non-null    int64  \n",
      " 47  BlueDog_Caucus_Cong116                        503 non-null    int64  \n",
      " 48  ProblemSolvers_Caucus_Cong116                 503 non-null    int64  \n",
      " 49  RSC_Caucus_Cong116                            503 non-null    int64  \n",
      " 50  Freedom_Caucus_Cong116                        503 non-null    int64  \n",
      " 51  GOP_Leadership_Cong116                        503 non-null    int64  \n",
      " 52  DEM_Leadership_Cong116                        503 non-null    int64  \n",
      " 53  CommitteeChair_Cong116                        503 non-null    int64  \n",
      " 54  TopCommittee_Cong116                          503 non-null    int64  \n",
      "dtypes: float64(21), int64(28), object(6)\n",
      "memory usage: 216.3+ KB\n"
     ]
    }
   ],
   "source": [
    "# Sanity check of the assembled frame: prints the row count plus each column's\n",
    "# non-null count and dtype, which shows where values are missing.\n",
    "final_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a74fc9ea-0c0b-450b-9e55-81b4f33bab64",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:pg3] *",
   "language": "python",
   "name": "conda-env-pg3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
